In [ ]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import matplotlib.pyplot as plt


x = pd.read_csv('job_descriptions.csv')

# Extract job descriptions
job_descriptions = x['Job Description']

all_descriptions = ' '.join(job_descriptions)

# Tokenize the text into words
words = word_tokenize(all_descriptions)

# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stop_words and word.isalpha()]

# Calculate frequency distribution
fdist = FreqDist(filtered_words)

# Plot the most common words
plt.figure(figsize=(10, 6))
fdist.plot(30, cumulative=False)
plt.title('Top 30 Most Common Words in Job Descriptions')
plt.xlabel('Word')
plt.ylabel('Frequency')
plt.show()
In [2]:
x.describe()
Out[2]:
Job Id latitude longitude Company Size
count 1.615940e+06 1.615940e+06 1.615940e+06 1.615940e+06
mean 1.548935e+15 1.937743e+01 1.639926e+01 7.370467e+04
std 8.946722e+14 2.355690e+01 7.066762e+01 3.529886e+04
min 1.817948e+11 -4.090060e+01 -1.751982e+02 1.264600e+04
25% 7.740508e+14 5.152100e+00 -1.531010e+01 4.311400e+04
50% 1.547858e+15 1.807080e+01 1.914510e+01 7.363300e+04
75% 2.323729e+15 3.907420e+01 4.757690e+01 1.043000e+05
max 3.099618e+15 7.170690e+01 1.780650e+02 1.348340e+05
In [4]:
x.shape
Out[4]:
(1615940, 23)
In [5]:
x.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1615940 entries, 0 to 1615939
Data columns (total 23 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   Job Id            1615940 non-null  int64  
 1   Experience        1615940 non-null  object 
 2   Qualifications    1615940 non-null  object 
 3   Salary Range      1615940 non-null  object 
 4   location          1615940 non-null  object 
 5   Country           1615940 non-null  object 
 6   latitude          1615940 non-null  float64
 7   longitude         1615940 non-null  float64
 8   Work Type         1615940 non-null  object 
 9   Company Size      1615940 non-null  int64  
 10  Job Posting Date  1615940 non-null  object 
 11  Preference        1615940 non-null  object 
 12  Contact Person    1615940 non-null  object 
 13  Contact           1615940 non-null  object 
 14  Job Title         1615940 non-null  object 
 15  Role              1615940 non-null  object 
 16  Job Portal        1615940 non-null  object 
 17  Job Description   1615940 non-null  object 
 18  Benefits          1615940 non-null  object 
 19  skills            1615940 non-null  object 
 20  Responsibilities  1615940 non-null  object 
 21  Company           1615940 non-null  object 
 22  Company Profile   1610462 non-null  object 
dtypes: float64(2), int64(2), object(19)
memory usage: 283.6+ MB
In [6]:
x.dtypes
Out[6]:
Job Id                int64
Experience           object
Qualifications       object
Salary Range         object
location             object
Country              object
latitude            float64
longitude           float64
Work Type            object
Company Size          int64
Job Posting Date     object
Preference           object
Contact Person       object
Contact              object
Job Title            object
Role                 object
Job Portal           object
Job Description      object
Benefits             object
skills               object
Responsibilities     object
Company              object
Company Profile      object
dtype: object
In [ ]:
     
In [13]:
import matplotlib.pyplot as plt

plt.hist(x['Company Size'], bins=20)
plt.xlabel('Company Size')
plt.ylabel('Frequency')
plt.title('Histogram of Company Size')
plt.show()
No description has been provided for this image
In [14]:
job_id_bins = pd.qcut(x['Job Id'], q=5)
x.groupby(job_id_bins)['Company Size'].mean().plot(kind='bar')
plt.xlabel('Job ID')
plt.ylabel('Average Company Size')
plt.title('Bar Plot of Company Size by Job ID')
plt.show()
C:\Users\pc\AppData\Local\Temp\ipykernel_22428\2552711917.py:2: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  x.groupby(job_id_bins)['Company Size'].mean().plot(kind='bar')
No description has been provided for this image
In [16]:
import seaborn as sns

sns.violinplot(x='latitude', y='Company Size', data=x)
plt.xlabel('Latitude')
plt.ylabel('Company Size')
plt.title('Violin Plot of Company Size by Latitude')
plt.show()
No description has been provided for this image
In [18]:
plt.scatter(x['latitude'], x['longitude'], alpha=0.5)
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.title('Scatter Plot of Latitude vs Longitude')
plt.show()
No description has been provided for this image
In [19]:
plt.figure(figsize=(10,6))
x['Role'].value_counts().plot(kind='bar')
plt.xlabel('Role') 
plt.ylabel('Frequency')
plt.title('Bar Plot of Role Frequency')
plt.show()
No description has been provided for this image
In [23]:
plt.hist(x['Salary Range'], bins=20)
plt.xlabel('Salary Range')
plt.ylabel('Frequency')
plt.title('Histogram of Salary Range')
plt.show()
No description has been provided for this image
In [24]:
x.columns
Out[24]:
Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location',
       'Country', 'latitude', 'longitude', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Contact Person', 'Contact',
       'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits',
       'skills', 'Responsibilities', 'Company', 'Company Profile'],
      dtype='object')
In [26]:
x = x.drop(columns=['latitude'])
In [28]:
x = x.drop(columns=['longitude'])
In [29]:
x.columns
Out[29]:
Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location',
       'Country', 'Work Type', 'Company Size', 'Job Posting Date',
       'Preference', 'Contact Person', 'Contact', 'Job Title', 'Role',
       'Job Portal', 'Job Description', 'Benefits', 'skills',
       'Responsibilities', 'Company', 'Company Profile'],
      dtype='object')
In [12]:
plt.figure(figsize=(8,8))
x['Work Type'].value_counts().plot(kind='pie', autopct='%1.1f%%')
plt.ylabel('Work Type')
plt.title('Pie Chart of Work Type')
plt.show()
No description has been provided for this image
In [14]:
# Descriptive statistics for numeric columns
numeric_columns = ['Job Id', 'Salary Range', 'Company Size']
numeric_stats = x[numeric_columns].describe()
print(numeric_stats)
             Job Id  Company Size
count  1.615940e+06  1.615940e+06
mean   1.548935e+15  7.370467e+04
std    8.946722e+14  3.529886e+04
min    1.817948e+11  1.264600e+04
25%    7.740508e+14  4.311400e+04
50%    1.547858e+15  7.363300e+04
75%    2.323729e+15  1.043000e+05
max    3.099618e+15  1.348340e+05
In [15]:
# Distribution of categorical columns
categorical_columns = ['Work Type', 'Role', 'Job Portal']
for column in categorical_columns:
    print(x[column].value_counts())
Work Type
Part-Time    324044
Temporary    323439
Contract     323131
Intern       323090
Full-Time    322236
Name: count, dtype: int64
Role
Interaction Designer            20580
Network Administrator           17470
User Interface Designer         14036
Social Media Manager            13945
User Experience Designer        13935
                                ...  
Inventory Control Specialist     3342
Budget Analyst                   3335
Clinical Nurse Manager           3324
Social Science Researcher        3321
Paid Advertising Specialist      3306
Name: count, Length: 376, dtype: int64
Job Portal
FlexJobs               129879
Stack Overflow Jobs    129379
Jobs2Careers           129245
Snagajob               129088
USAJOBS                129066
SimplyHired            129059
The Muse               129033
Idealist               128952
Internships.com        128790
Monster                 65058
Dice                    64927
ZipRecruiter            64805
Indeed                  64776
CareerBuilder           64752
LinkedIn                64664
Glassdoor               64467
Name: count, dtype: int64
In [2]:
import pandas as pd
x = pd.read_csv('job_descriptions.csv')
print(x)
                   Job Id     Experience Qualifications Salary Range  \
0        1089843540111562  5 to 15 Years         M.Tech    $59K-$99K   
1         398454096642776  2 to 12 Years            BCA   $56K-$116K   
2         481640072963533  0 to 12 Years            PhD   $61K-$104K   
3         688192671473044  4 to 11 Years            PhD    $65K-$91K   
4         117057806156508  1 to 12 Years            MBA    $64K-$87K   
...                   ...            ...            ...          ...   
1615935   134563577088850  0 to 12 Years         B.Tech   $64K-$114K   
1615936   618604818190827  2 to 14 Years         M.Tech   $62K-$130K   
1615937   615471367712200  4 to 15 Years            BCA    $60K-$96K   
1615938   804137342023945  5 to 15 Years            BCA   $65K-$103K   
1615939   404645755314484  1 to 11 Years            BBA   $56K-$109K   

                  location            Country  latitude  longitude  Work Type  \
0                  Douglas        Isle of Man   54.2361    -4.5481     Intern   
1                 Ashgabat       Turkmenistan   38.9697    59.5563     Intern   
2                    Macao   Macao SAR, China   22.1987   113.5439  Temporary   
3               Porto-Novo              Benin    9.3077     2.3158  Full-Time   
4                 Santiago              Chile  -35.6751   -71.5429     Intern   
...                    ...                ...       ...        ...        ...   
1615935  Malabo (de jure),  Equatorial Guinea    1.6508    10.2679  Full-Time   
1615936             Warsaw             Poland   51.9194    19.1451     Intern   
1615937           Ashgabat       Turkmenistan   38.9697    59.5563  Part-Time   
1615938        Ouagadougou       Burkina Faso   12.2383    -1.5616  Full-Time   
1615939             Asmara            Eritrea   15.1794    39.7823  Part-Time   

         Company Size  ...                Contact  \
0               26801  ...   001-381-930-7517x737   
1              100340  ...           461-509-4216   
2               84525  ...             9687619505   
3              129896  ...  +1-820-643-5431x47576   
4               53944  ...      343.975.4702x9340   
...               ...  ...                    ...   
1615935         18281  ...           950-451-5843   
1615936         63621  ...     676.387.1572x71877   
1615937        114287  ...      537.384.6193x5284   
1615938         45009  ...     (484)257-4755x5346   
1615939         87637  ...          (989)703-9723   

                            Job Title                        Role  \
0        Digital Marketing Specialist        Social Media Manager   
1                       Web Developer      Frontend Web Developer   
2                  Operations Manager     Quality Control Manager   
3                    Network Engineer   Wireless Network Engineer   
4                       Event Manager          Conference Manager   
...                               ...                         ...   
1615935           Mechanical Engineer  Mechanical Design Engineer   
1615936                    IT Manager                 IT Director   
1615937           Mechanical Engineer  Mechanical Design Engineer   
1615938                HR Coordinator        Training Coordinator   
1615939                 Event Planner             Wedding Planner   

                  Job Portal  \
0                   Snagajob   
1                   Idealist   
2               Jobs2Careers   
3                   FlexJobs   
4               Jobs2Careers   
...                      ...   
1615935         ZipRecruiter   
1615936              USAJOBS   
1615937               Indeed   
1615938  Stack Overflow Jobs   
1615939              USAJOBS   

                                           Job Description  \
0        Social Media Managers oversee an organizations...   
1        Frontend Web Developers design and implement u...   
2        Quality Control Managers establish and enforce...   
3        Wireless Network Engineers design, implement, ...   
4        A Conference Manager coordinates and manages c...   
...                                                    ...   
1615935  Mechanical Design Engineers create and develop...   
1615936  An IT Director oversees an organizations IT de...   
1615937  Mechanical Design Engineers create and develop...   
1615938  Training Coordinators design and implement emp...   
1615939  Wedding Planners specialize in organizing wedd...   

                                                  Benefits  \
0        {'Flexible Spending Accounts (FSAs), Relocatio...   
1        {'Health Insurance, Retirement Plans, Paid Tim...   
2        {'Legal Assistance, Bonuses and Incentive Prog...   
3        {'Transportation Benefits, Professional Develo...   
4        {'Flexible Spending Accounts (FSAs), Relocatio...   
...                                                    ...   
1615935  {'Employee Assistance Programs (EAP), Tuition ...   
1615936  {'Health Insurance, Retirement Plans, Paid Tim...   
1615937  {'Tuition Reimbursement, Stock Options or Equi...   
1615938  {'Casual Dress Code, Social and Recreational A...   
1615939  {'Transportation Benefits, Professional Develo...   

                                                    skills  \
0        Social media platforms (e.g., Facebook, Twitte...   
1        HTML, CSS, JavaScript Frontend frameworks (e.g...   
2        Quality control processes and methodologies St...   
3        Wireless network design and architecture Wi-Fi...   
4        Event planning Conference logistics Budget man...   
...                                                    ...   
1615935  Mechanical engineering CAD software (e.g., Sol...   
1615936  Strategic IT planning Leadership and managemen...   
1615937  Mechanical engineering CAD software (e.g., Sol...   
1615938  Training program coordination Training materia...   
1615939  Wedding planning Venue selection Catering and ...   

                                          Responsibilities  \
0        Manage and grow social media accounts, create ...   
1        Design and code user interfaces for websites, ...   
2        Establish and enforce quality control standard...   
3        Design, configure, and optimize wireless netwo...   
4        Specialize in conference and convention planni...   
...                                                    ...   
1615935  Design mechanical systems, components, and pro...   
1615936  Provide strategic leadership for IT department...   
1615937  Design mechanical systems, components, and pro...   
1615938  Coordinate employee training programs, track t...   
1615939  Specialize in wedding planning, assisting coup...   

                                  Company  \
0                       Icahn Enterprises   
1            PNC Financial Services Group   
2        United Services Automobile Assn.   
3                                    Hess   
4                            Cairn Energy   
...                                   ...   
1615935               The Hershey Company   
1615936                               EQT   
1615937                               KLA   
1615938               Mahindra & Mahindra   
1615939                     Ashtead Group   

                                           Company Profile  
0        {"Sector":"Diversified","Industry":"Diversifie...  
1        {"Sector":"Financial Services","Industry":"Com...  
2        {"Sector":"Insurance","Industry":"Insurance: P...  
3        {"Sector":"Energy","Industry":"Mining, Crude-O...  
4        {"Sector":"Energy","Industry":"Energy - Oil & ...  
...                                                    ...  
1615935  {"Sector":"Food and Beverage/Confectionery","I...  
1615936  {"Sector":"Energy","Industry":"Energy","City":...  
1615937  {"Sector":"Technology","Industry":"Semiconduct...  
1615938  {"Sector":"Automotive","Industry":"Automotive"...  
1615939  {"Sector":"Equipment Rental","Industry":"Equip...  

[1615940 rows x 23 columns]
In [3]:
import matplotlib.pyplot as pd

# Your plotting code
plt.boxplot(x['Experience'])
plt.ylabel('Experience')
plt.title('Box Plot of Experience')
plt.show()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[3], line 4
      1 import matplotlib.pyplot as pd
      3 # Your plotting code
----> 4 plt.boxplot(x['Experience'])
      5 plt.ylabel('Experience')
      6 plt.title('Box Plot of Experience')

NameError: name 'x' is not defined
In [6]:
import pandas as pd
x = pd.read_csv('job_descriptions.csv')
plt.boxplot(x['Company Size'])
plt.ylabel('Company Size')
plt.title('Box Plot of Company Size')
plt.show()
No description has been provided for this image
In [5]:
import pandas as pd


# Load dataset
data = pd.read_csv("your_dataset.csv")

# Descriptive analysis
print("Summary Statistics:")
print(data.describe())

print("\nUnique Values and Frequency Distributions:")
for column in data.columns:
    print(column)
    print(data[column].value_counts())
    print()

# Text analysis of job descriptions
# Example: Sentiment analysis
sentiments = [TextBlob(text).sentiment.polarity for text in data["Job Description"]]
data["Sentiment"] = sentiments

# Company profile analysis
# Example: Analyze company profiles to understand company reputation, values, and culture
# You can use techniques like keyword extraction and sentiment analysis for this

# Further analysis and visualization as needed
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[5], line 5
      1 import pandas as pd
      4 # Load dataset
----> 5 data = pd.read_csv("your_dataset.csv")
      7 # Descriptive analysis
      8 print("Summary Statistics:")

File ~\anaconda3\Lib\site-packages\pandas\io\parsers\readers.py:948, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
    935 kwds_defaults = _refine_defaults_read(
    936     dialect,
    937     delimiter,
   (...)
    944     dtype_backend=dtype_backend,
    945 )
    946 kwds.update(kwds_defaults)
--> 948 return _read(filepath_or_buffer, kwds)

File ~\anaconda3\Lib\site-packages\pandas\io\parsers\readers.py:611, in _read(filepath_or_buffer, kwds)
    608 _validate_names(kwds.get("names", None))
    610 # Create the parser.
--> 611 parser = TextFileReader(filepath_or_buffer, **kwds)
    613 if chunksize or iterator:
    614     return parser

File ~\anaconda3\Lib\site-packages\pandas\io\parsers\readers.py:1448, in TextFileReader.__init__(self, f, engine, **kwds)
   1445     self.options["has_index_names"] = kwds["has_index_names"]
   1447 self.handles: IOHandles | None = None
-> 1448 self._engine = self._make_engine(f, self.engine)

File ~\anaconda3\Lib\site-packages\pandas\io\parsers\readers.py:1705, in TextFileReader._make_engine(self, f, engine)
   1703     if "b" not in mode:
   1704         mode += "b"
-> 1705 self.handles = get_handle(
   1706     f,
   1707     mode,
   1708     encoding=self.options.get("encoding", None),
   1709     compression=self.options.get("compression", None),
   1710     memory_map=self.options.get("memory_map", False),
   1711     is_text=is_text,
   1712     errors=self.options.get("encoding_errors", "strict"),
   1713     storage_options=self.options.get("storage_options", None),
   1714 )
   1715 assert self.handles is not None
   1716 f = self.handles.handle

File ~\anaconda3\Lib\site-packages\pandas\io\common.py:863, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    858 elif isinstance(handle, str):
    859     # Check whether the filename is to be opened in binary mode.
    860     # Binary mode does not support 'encoding' and 'newline'.
    861     if ioargs.encoding and "b" not in ioargs.mode:
    862         # Encoding
--> 863         handle = open(
    864             handle,
    865             ioargs.mode,
    866             encoding=ioargs.encoding,
    867             errors=errors,
    868             newline="",
    869         )
    870     else:
    871         # Binary mode
    872         handle = open(handle, ioargs.mode)

FileNotFoundError: [Errno 2] No such file or directory: 'your_dataset.csv'
In [7]:
!pip install spacy
Collecting spacy
  Downloading spacy-3.7.4-cp311-cp311-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.10-cp311-cp311-win_amd64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.8-cp311-cp311-win_amd64.whl.metadata (8.6 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp311-cp311-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy)
  Downloading thinc-8.2.3-cp311-cp311-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.2-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.4.8-cp311-cp311-win_amd64.whl.metadata (20 kB)
Collecting catalogue<2.1.0,>=2.0.6 (from spacy)
  Downloading catalogue-2.0.10-py3-none-any.whl.metadata (14 kB)
Collecting weasel<0.4.0,>=0.1.0 (from spacy)
  Downloading weasel-0.3.4-py3-none-any.whl.metadata (4.7 kB)
Collecting typer<0.10.0,>=0.3.0 (from spacy)
  Downloading typer-0.9.4-py3-none-any.whl.metadata (14 kB)
Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in c:\users\pc\anaconda3\lib\site-packages (from spacy) (5.2.1)
Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy) (4.65.0)
Requirement already satisfied: requests<3.0.0,>=2.13.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy) (2.31.0)
Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in c:\users\pc\anaconda3\lib\site-packages (from spacy) (1.10.12)
Requirement already satisfied: jinja2 in c:\users\pc\anaconda3\lib\site-packages (from spacy) (3.1.3)
Requirement already satisfied: setuptools in c:\users\pc\anaconda3\lib\site-packages (from spacy) (68.2.2)
Requirement already satisfied: packaging>=20.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy) (23.1)
Collecting langcodes<4.0.0,>=3.2.0 (from spacy)
  Downloading langcodes-3.4.0-py3-none-any.whl.metadata (29 kB)
Requirement already satisfied: numpy>=1.19.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy) (1.26.4)
Collecting language-data>=1.2 (from langcodes<4.0.0,>=3.2.0->spacy)
  Downloading language_data-1.2.0-py3-none-any.whl.metadata (4.3 kB)
Requirement already satisfied: typing-extensions>=4.2.0 in c:\users\pc\anaconda3\lib\site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (4.9.0)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\pc\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in c:\users\pc\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.4)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\pc\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.0.7)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\pc\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy) (2024.2.2)
Collecting blis<0.8.0,>=0.7.8 (from thinc<8.3.0,>=8.2.2->spacy)
  Downloading blis-0.7.11-cp311-cp311-win_amd64.whl.metadata (7.6 kB)
Collecting confection<1.0.0,>=0.0.1 (from thinc<8.3.0,>=8.2.2->spacy)
  Downloading confection-0.1.4-py3-none-any.whl.metadata (19 kB)
Requirement already satisfied: colorama in c:\users\pc\anaconda3\lib\site-packages (from tqdm<5.0.0,>=4.38.0->spacy) (0.4.6)
Requirement already satisfied: click<9.0.0,>=7.1.1 in c:\users\pc\anaconda3\lib\site-packages (from typer<0.10.0,>=0.3.0->spacy) (8.1.7)
Collecting cloudpathlib<0.17.0,>=0.7.0 (from weasel<0.4.0,>=0.1.0->spacy)
  Downloading cloudpathlib-0.16.0-py3-none-any.whl.metadata (14 kB)
Requirement already satisfied: MarkupSafe>=2.0 in c:\users\pc\anaconda3\lib\site-packages (from jinja2->spacy) (2.1.3)
Collecting marisa-trie>=0.7.7 (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy)
  Downloading marisa_trie-1.1.0-cp311-cp311-win_amd64.whl.metadata (8.8 kB)
Downloading spacy-3.7.4-cp311-cp311-win_amd64.whl (12.1 MB)
   ---------------------------------------- 0.0/12.1 MB ? eta -:--:--
   - -------------------------------------- 0.5/12.1 MB 16.5 MB/s eta 0:00:01
   ----- ---------------------------------- 1.8/12.1 MB 22.7 MB/s eta 0:00:01
   ---------- ----------------------------- 3.1/12.1 MB 24.7 MB/s eta 0:00:01
   --------------- ------------------------ 4.8/12.1 MB 25.4 MB/s eta 0:00:01
   --------------------- ------------------ 6.5/12.1 MB 29.8 MB/s eta 0:00:01
   ------------------------- -------------- 7.8/12.1 MB 29.4 MB/s eta 0:00:01
   -------------------------------- ------- 9.7/12.1 MB 31.0 MB/s eta 0:00:01
   ------------------------------------- -- 11.2/12.1 MB 34.6 MB/s eta 0:00:01
   ---------------------------------------  12.1/12.1 MB 34.4 MB/s eta 0:00:01
   ---------------------------------------- 12.1/12.1 MB 29.7 MB/s eta 0:00:00
Downloading catalogue-2.0.10-py3-none-any.whl (17 kB)
Downloading cymem-2.0.8-cp311-cp311-win_amd64.whl (39 kB)
Downloading langcodes-3.4.0-py3-none-any.whl (182 kB)
   ---------------------------------------- 0.0/182.0 kB ? eta -:--:--
   --------------------------------------- 182.0/182.0 kB 10.7 MB/s eta 0:00:00
Downloading murmurhash-1.0.10-cp311-cp311-win_amd64.whl (25 kB)
Downloading preshed-3.0.9-cp311-cp311-win_amd64.whl (122 kB)
   ---------------------------------------- 0.0/122.3 kB ? eta -:--:--
   ---------------------------------------- 122.3/122.3 kB ? eta 0:00:00
Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Downloading spacy_loggers-1.0.5-py3-none-any.whl (22 kB)
Downloading srsly-2.4.8-cp311-cp311-win_amd64.whl (479 kB)
   ---------------------------------------- 0.0/479.7 kB ? eta -:--:--
   --------------------------------------- 479.7/479.7 kB 14.7 MB/s eta 0:00:00
Downloading thinc-8.2.3-cp311-cp311-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------  1.5/1.5 MB 31.2 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 23.3 MB/s eta 0:00:00
Downloading typer-0.9.4-py3-none-any.whl (45 kB)
   ---------------------------------------- 0.0/46.0 kB ? eta -:--:--
   ---------------------------------------- 46.0/46.0 kB 2.2 MB/s eta 0:00:00
Downloading wasabi-1.1.2-py3-none-any.whl (27 kB)
Downloading weasel-0.3.4-py3-none-any.whl (50 kB)
   ---------------------------------------- 0.0/50.1 kB ? eta -:--:--
   ---------------------------------------- 50.1/50.1 kB 2.5 MB/s eta 0:00:00
Downloading blis-0.7.11-cp311-cp311-win_amd64.whl (6.6 MB)
   ---------------------------------------- 0.0/6.6 MB ? eta -:--:--
   -------- ------------------------------- 1.5/6.6 MB 47.6 MB/s eta 0:00:01
   ------------------- -------------------- 3.2/6.6 MB 40.6 MB/s eta 0:00:01
   ------------------------------ --------- 5.1/6.6 MB 40.6 MB/s eta 0:00:01
   -------------------------------------- - 6.4/6.6 MB 37.4 MB/s eta 0:00:01
   ---------------------------------------- 6.6/6.6 MB 32.5 MB/s eta 0:00:00
Downloading cloudpathlib-0.16.0-py3-none-any.whl (45 kB)
   ---------------------------------------- 0.0/45.0 kB ? eta -:--:--
   ---------------------------------------- 45.0/45.0 kB ? eta 0:00:00
Downloading confection-0.1.4-py3-none-any.whl (35 kB)
Downloading language_data-1.2.0-py3-none-any.whl (5.4 MB)
   ---------------------------------------- 0.0/5.4 MB ? eta -:--:--
   ------------ --------------------------- 1.7/5.4 MB 35.5 MB/s eta 0:00:01
   ------------------------ --------------- 3.3/5.4 MB 34.9 MB/s eta 0:00:01
   -------------------------------------- - 5.2/5.4 MB 36.9 MB/s eta 0:00:01
   ---------------------------------------  5.4/5.4 MB 38.3 MB/s eta 0:00:01
   ---------------------------------------- 5.4/5.4 MB 26.5 MB/s eta 0:00:00
Downloading marisa_trie-1.1.0-cp311-cp311-win_amd64.whl (152 kB)
   ---------------------------------------- 0.0/152.6 kB ? eta -:--:--
   ---------------------------------------- 152.6/152.6 kB 8.9 MB/s eta 0:00:00
Installing collected packages: cymem, wasabi, spacy-loggers, spacy-legacy, murmurhash, marisa-trie, cloudpathlib, catalogue, blis, typer, srsly, preshed, language-data, langcodes, confection, weasel, thinc, spacy
Successfully installed blis-0.7.11 catalogue-2.0.10 cloudpathlib-0.16.0 confection-0.1.4 cymem-2.0.8 langcodes-3.4.0 language-data-1.2.0 marisa-trie-1.1.0 murmurhash-1.0.10 preshed-3.0.9 spacy-3.7.4 spacy-legacy-3.0.12 spacy-loggers-1.0.5 srsly-2.4.8 thinc-8.2.3 typer-0.9.4 wasabi-1.1.2 weasel-0.3.4
In [8]:
!python -m spacy download en_core_web_sm
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 330.3 kB/s eta 0:00:39
     --------------------------------------- 0.1/12.8 MB 469.7 kB/s eta 0:00:28
     - -------------------------------------- 0.4/12.8 MB 2.2 MB/s eta 0:00:06
     ----- ---------------------------------- 1.6/12.8 MB 7.9 MB/s eta 0:00:02
     --------- ------------------------------ 3.0/12.8 MB 12.1 MB/s eta 0:00:01
     -------------- ------------------------- 4.7/12.8 MB 15.8 MB/s eta 0:00:01
     -------------------- ------------------- 6.6/12.8 MB 19.3 MB/s eta 0:00:01
     ------------------------ --------------- 8.0/12.8 MB 20.5 MB/s eta 0:00:01
     ------------------------------ --------- 9.8/12.8 MB 22.5 MB/s eta 0:00:01
     -------------------------------- ------ 10.8/12.8 MB 34.4 MB/s eta 0:00:01
     ---------------------------------- ---- 11.4/12.8 MB 29.7 MB/s eta 0:00:01
     ------------------------------------ -- 12.1/12.8 MB 28.4 MB/s eta 0:00:01
     --------------------------------------  12.8/12.8 MB 28.5 MB/s eta 0:00:01
     --------------------------------------  12.8/12.8 MB 28.5 MB/s eta 0:00:01
     --------------------------------------  12.8/12.8 MB 28.5 MB/s eta 0:00:01
     --------------------------------------  12.8/12.8 MB 28.5 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 18.7 MB/s eta 0:00:00
Requirement already satisfied: spacy<3.8.0,>=3.7.2 in c:\users\pc\anaconda3\lib\site-packages (from en-core-web-sm==3.7.1) (3.7.4)
Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.12)
Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.5)
Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.10)
Requirement already satisfied: cymem<2.1.0,>=2.0.2 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.8)
Requirement already satisfied: preshed<3.1.0,>=3.0.2 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.9)
Requirement already satisfied: thinc<8.3.0,>=8.2.2 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.2.3)
Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.1.2)
Requirement already satisfied: srsly<3.0.0,>=2.4.3 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.4.8)
Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.10)
Requirement already satisfied: weasel<0.4.0,>=0.1.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.3.4)
Requirement already satisfied: typer<0.10.0,>=0.3.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.9.4)
Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (5.2.1)
Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.65.0)
Requirement already satisfied: requests<3.0.0,>=2.13.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.31.0)
Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.10.12)
Requirement already satisfied: jinja2 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.1.3)
Requirement already satisfied: setuptools in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (68.2.2)
Requirement already satisfied: packaging>=20.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (23.1)
Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.4.0)
Requirement already satisfied: numpy>=1.19.0 in c:\users\pc\anaconda3\lib\site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.26.4)
Requirement already satisfied: language-data>=1.2 in c:\users\pc\anaconda3\lib\site-packages (from langcodes<4.0.0,>=3.2.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.2.0)
Requirement already satisfied: typing-extensions>=4.2.0 in c:\users\pc\anaconda3\lib\site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.9.0)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\pc\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in c:\users\pc\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.4)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\pc\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.7)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\pc\anaconda3\lib\site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2024.2.2)
Requirement already satisfied: blis<0.8.0,>=0.7.8 in c:\users\pc\anaconda3\lib\site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.7.11)
Requirement already satisfied: confection<1.0.0,>=0.0.1 in c:\users\pc\anaconda3\lib\site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.1.4)
Requirement already satisfied: colorama in c:\users\pc\anaconda3\lib\site-packages (from tqdm<5.0.0,>=4.38.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.4.6)
Requirement already satisfied: click<9.0.0,>=7.1.1 in c:\users\pc\anaconda3\lib\site-packages (from typer<0.10.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.1.7)
Requirement already satisfied: cloudpathlib<0.17.0,>=0.7.0 in c:\users\pc\anaconda3\lib\site-packages (from weasel<0.4.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.16.0)
Requirement already satisfied: MarkupSafe>=2.0 in c:\users\pc\anaconda3\lib\site-packages (from jinja2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.1.3)
Requirement already satisfied: marisa-trie>=0.7.7 in c:\users\pc\anaconda3\lib\site-packages (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.1.0)
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
In [9]:
import pandas as pd
import spacy
from collections import Counter
from textblob import TextBlob

# Load dataset
data = pd.read_csv("your_dataset.csv")

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Keyword extraction
skills_responsibilities = []
company_values = []
for text in data["Job Description"]:
    doc = nlp(text)
    for token in doc:
        if token.pos_ == "NOUN":
            if token.dep_ == "dobj" or token.dep_ == "attr":
                skills_responsibilities.append(token.text)
            elif token.dep_ == "nsubj" or token.dep_ == "pobj":
                company_values.append(token.text)

# Sentiment analysis
sentiments = [TextBlob(text).sentiment.polarity for text in data["Job Description"]]
data["Sentiment"] = sentiments

# Optimization for search engine visibility (Not implemented in this example)

# Further analysis and visualization as needed
print("Common Skills and Responsibilities:")
print(Counter(skills_responsibilities).most_common(10))

print("\nCompany Values:")
print(Counter(company_values).most_common(5))

print("\nSentiment Analysis:")
print(data[["Job Description", "Sentiment"]])
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[9], line 4
      2 import spacy
      3 from collections import Counter
----> 4 from textblob import TextBlob
      6 # Load dataset
      7 data = pd.read_csv("your_dataset.csv")

ModuleNotFoundError: No module named 'textblob'
In [11]:
!pip install textblob
Requirement already satisfied: textblob in c:\users\pc\anaconda3\lib\site-packages (0.18.0.post0)
Requirement already satisfied: nltk>=3.8 in c:\users\pc\anaconda3\lib\site-packages (from textblob) (3.8.1)
Requirement already satisfied: click in c:\users\pc\anaconda3\lib\site-packages (from nltk>=3.8->textblob) (8.1.7)
Requirement already satisfied: joblib in c:\users\pc\anaconda3\lib\site-packages (from nltk>=3.8->textblob) (1.2.0)
Requirement already satisfied: regex>=2021.8.3 in c:\users\pc\anaconda3\lib\site-packages (from nltk>=3.8->textblob) (2023.10.3)
Requirement already satisfied: tqdm in c:\users\pc\anaconda3\lib\site-packages (from nltk>=3.8->textblob) (4.65.0)
Requirement already satisfied: colorama in c:\users\pc\anaconda3\lib\site-packages (from click->nltk>=3.8->textblob) (0.4.6)
In [12]:
import pandas as pd
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Read the CSV file into a DataFrame
df = pd.read_csv('your_data.csv')

# Tokenize the text in the 'Job Description' column
descriptions = df['Job Description'].dropna().str.split()

# Flatten the list of lists and count the frequency of words
word_counts = Counter([word for sublist in descriptions for word in sublist])

# Plot word frequency using a word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_counts)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Job Description')
plt.show()
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[12], line 3
      1 import pandas as pd
      2 from collections import Counter
----> 3 from wordcloud import WordCloud
      4 import matplotlib.pyplot as plt
      6 # Read the CSV file into a DataFrame

ModuleNotFoundError: No module named 'wordcloud'
In [ ]:
 import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import matplotlib.pyplot as plt

# Load the dataset
x = pd.read_csv('job_descriptions.csv')

# Extract job descriptions
job_descriptions = x['Job Description']

# Concatenate all job descriptions into a single string
all_descriptions = ' '.join(job_descriptions)

# Tokenize the text into words
words = word_tokenize(all_descriptions)

# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.lower() not in stop_words and word.isalpha()]

# Calculate frequency distribution
fdist = FreqDist(filtered_words)

# Plot the most common words
plt.figure(figsize=(10, 6))
fdist.plot(30, cumulative=False)
plt.title('Top 30 Most Common Words in Job Descriptions')
plt.xlabel('Word')
plt.ylabel('Frequency')
plt.show()
In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv('job_descriptions.csv')

# Plotting skills
skills_counts = data['skills'].value_counts().head(10)
plt.figure(figsize=(10,6))
skills_counts.plot(kind='bar', color='skyblue')
plt.title('Top 10 Most Required Skills')
plt.xlabel('Skills')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()

# Plotting responsibilities
responsibilities_counts = data['Responsibilities'].value_counts().head(10)
plt.figure(figsize=(10,6))
responsibilities_counts.plot(kind='bar', color='lightgreen')
plt.title('Top 10 Most Common Responsibilities')
plt.xlabel('Responsibilities')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()

# Plotting companies
company_counts = data['Company'].value_counts().head(10)
plt.figure(figsize=(10,6))
company_counts.plot(kind='bar', color='salmon')
plt.title('Top 10 Companies with Most Job Openings')
plt.xlabel('Company')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()

# Plotting company profiles
company_profile_counts = data['Company Profile'].value_counts().head(10)
plt.figure(figsize=(10,6))
company_profile_counts.plot(kind='bar', color='gold')
plt.title('Top 10 Most Common Company Profiles')
plt.xlabel('Company Profile')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [2]:
# Plotting skills
skills_counts = data['skills'].value_counts().head(10)
plt.figure(figsize=(10,6))
skills_counts.plot(kind='bar', color='skyblue')
plt.title('Top 10 Most Required Skills')
plt.xlabel('Skills')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [3]:
# Plotting responsibilities
responsibilities_counts = data['Responsibilities'].value_counts().head(10)
plt.figure(figsize=(10,6))
responsibilities_counts.plot(kind='bar', color='lightgreen')
plt.title('Top 10 Most Common Responsibilities')
plt.xlabel('Responsibilities')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [4]:
# Plotting companies
company_counts = data['Company'].value_counts().head(10)
plt.figure(figsize=(10,6))
company_counts.plot(kind='bar', color='salmon')
plt.title('Top 10 Companies with Most Job Openings')
plt.xlabel('Company')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [5]:
# Plotting company profiles
company_profile_counts = data['Company Profile'].value_counts().head(10)
plt.figure(figsize=(10,6))
company_profile_counts.plot(kind='bar', color='gold')
plt.title('Top 10 Most Common Company Profiles')
plt.xlabel('Company Profile')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

# Example of keyword extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(x['skills'].dropna())
tfidf_features = tfidf_vectorizer.get_feature_names_out()
print("Top 10 keywords for skills:")
print(tfidf_features[:10])

# Example of sentiment analysis using TextBlob
x['Job_Description_Sentiment'] = x['Job Description'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
print("Average sentiment of job descriptions:", x['Job_Description_Sentiment'].mean())
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 6
      4 # Example of keyword extraction using TF-IDF
      5 tfidf_vectorizer = TfidfVectorizer(max_features=1000)
----> 6 tfidf_matrix = tfidf_vectorizer.fit_transform(x['skills'].dropna())
      7 tfidf_features = tfidf_vectorizer.get_feature_names_out()
      8 print("Top 10 keywords for skills:")

NameError: name 'x' is not defined
In [2]:
#Analyze skill gaps and match candidate profiles to job requirements.
Identify emerging skills and responsibilities in specific industries.
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Load the dataset
data = pd.read_csv('job_descriptions.csv')

# Tokenize and extract skills from job descriptions
vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=1000)
X = vectorizer.fit_transform(data['Job Description'].dropna())
skills = vectorizer.get_feature_names_out()

# Categorize skills into relevant categories
technical_skills = []
soft_skills = []

for skill in skills:
    if 'technical' in skill.lower() or 'programming' in skill.lower():
        technical_skills.append(skill)
    else:
        soft_skills.append(skill)

print("Technical Skills:", technical_skills)
print("Soft Skills:", soft_skills)
Technical Skills: ['provide technical', 'technical', 'technical support']
Soft Skills: ['account', 'accounting', 'accounts', 'accurate', 'achieve', 'acquisitions', 'across', 'activities', 'address', 'administrative', 'administrative support', 'administrative tasks', 'administrators', 'administrators manage', 'advertising', 'advice', 'advise', 'advisors', 'aesthetically', 'aesthetically pleasing', 'aesthetics', 'agile', 'align', 'align with', 'all', 'also', 'an', 'an organization', 'an organizations', 'analysis', 'analysis and', 'analyst', 'analysts', 'analysts analyze', 'analyze', 'analyze data', 'analyze financial', 'analyze social', 'analyzes', 'and', 'and analysis', 'and analyze', 'and analyzes', 'and applications', 'and assisting', 'and collaborate', 'and customer', 'and data', 'and design', 'and develop', 'and develops', 'and educate', 'and efficiency', 'and efficient', 'and engagement', 'and engaging', 'and enhance', 'and ensure', 'and ensures', 'and ensuring', 'and execute', 'and financial', 'and functional', 'and handling', 'and implement', 'and implementing', 'and improve', 'and interactive', 'and layouts', 'and maintain', 'and maintaining', 'and manage', 'and manages', 'and managing', 'and market', 'and meet', 'and optimize', 'and other', 'and oversee', 'and performance', 'and promote', 'and provide', 'and quality', 'and recommendations', 'and regulations', 'and respond', 'and responsive', 'and schedule', 'and security', 'and server', 'and services', 'and software', 'and support', 'and system', 'and systems', 'and treat', 'and user', 'and visually', 'and within', 'and work', 'answer', 'appealing', 'appealing and', 'appealing user', 'application', 'applications', 'applications they', 'architect', 'architectural', 'are', 'are responsible', 'art', 'artistic', 'as', 'aspects', 'aspects of', 'assess', 'assesses', 'assist', 'assist in', 'assist with', 'assistance', 'assistant', 'assisting', 'assists', 'assurance', 'audiences', 'automated', 'automation', 'availability', 'awareness', 'awareness and', 'backend', 'behavior', 'behaviors', 'behaviors and', 'being', 'benefit', 'best', 'between', 'brand', 'brand awareness', 'budget', 'budgeting', 'budgets', 'budgets and', 'build', 'build and', 'build relationships', 'building', 'building and', 'buildings', 'business', 'business decisions', 'business objectives', 'businesses', 'buttons', 'buttons and', 'by', 'by considering', 'campaigns', 'campaigns they', 'care', 'care and', 'care to', 'cases', 'chain', 'channels', 'child', 'children', 'client', 'clients', 'clients in', 'clients or', 'clinical', 'closely', 'closely with', 'cloud', 'code', 'cohesive', 'cohesive and', 'collaborate', 'collaborate with', 'collect', 'communicate', 'communication', 'communication and', 'communication to', 'community', 'company', 'companys', 'competitive', 'complex', 'compliance', 'compliance with', 'components', 'computer', 'computer networks', 'conditions', 'conduct', 'conduct keyword', 'conduct user', 'conducts', 'conferences', 'configure', 'configure monitor', 'considering', 'considering user', 'consistency', 'construction', 'consultant', 'consumer', 'consumer behavior', 'content', 'content and', 'content engage', 'contracts', 'contracts and', 'control', 'coordinate', 'coordinates', 'coordinating', 'coordinators', 'corporate', 'correspondence', 'cost', 'cost effective', 'cost saving', 'costs', 'counsel', 'create', 'create and', 'create intuitive', 'create meaningful', 'create visually', 'creating', 'creating user', 'cross', 'cross functional', 'custody', 'customer', 'customer satisfaction', 'customer success', 'customer support', 'customers', 'customers they', 'daily', 'data', 'data analysis', 'data and', 'data driven', 'data identify', 'data integrity', 'data storage', 'data to', 'database', 'databases', 'databases ensuring', 'day', 'decision', 'decision making', 'decisions', 'defects', 'delivery', 'demand', 'departments', 'design', 'design and', 'design implement', 'design layouts', 'design prototypes', 'designer', 'designers', 'designers create', 'designers focus', 'designers specialize', 'designing', 'designing and', 'designing user', 'designs', 'designs and', 'develop', 'develop and', 'developer', 'developer is', 'developers', 'developing', 'development', 'development and', 'development teams', 'develops', 'devices', 'diagnose', 'diagnose and', 'different', 'digital', 'digital interfaces', 'digital marketing', 'directors', 'disputes', 'distribution', 'document', 'documentation', 'documents', 'drive', 'drive brand', 'driven', 'driven decision', 'educate', 'effective', 'effectively', 'efficiency', 'efficiency and', 'efficient', 'efforts', 'electrical', 'elements', 'elements to', 'email', 'email marketing', 'employee', 'employees', 'employment', 'end', 'engage', 'engage with', 'engagement', 'engaging', 'engaging user', 'engine', 'engineer', 'engineers', 'engineers design', 'engineers focus', 'engines', 'enhance', 'enhance the', 'enhancing', 'ensure', 'ensure cohesive', 'ensure compliance', 'ensure efficient', 'ensure product', 'ensure the', 'ensure they', 'ensures', 'ensuring', 'ensuring compliance', 'ensuring data', 'ensuring optimal', 'ensuring seamless', 'ensuring they', 'environment', 'environmental', 'environments', 'evaluate', 'event', 'events', 'events including', 'execute', 'executives', 'experience', 'experience designers', 'experience of', 'experiences', 'experiences by', 'expert', 'expertise', 'facilitate', 'facilities', 'families', 'family', 'financial', 'financial goals', 'financial planning', 'findings', 'flow', 'focus', 'focus on', 'focuses', 'focuses on', 'focusing', 'focusing on', 'followers', 'followers and', 'for', 'for search', 'for various', 'for web', 'fostering', 'friendly', 'friendly and', 'friendly digital', 'from', 'front', 'front end', 'frontend', 'functional', 'functional teams', 'functionality', 'functionality and', 'gather', 'generate', 'goals', 'goals and', 'goods', 'goods and', 'graphics', 'growth', 'guidance', 'guide', 'handle', 'handling', 'hardware', 'hardware and', 'health', 'healthcare', 'help', 'helping', 'helping them', 'high', 'hr', 'identify', 'identify cost', 'identify defects', 'identify trends', 'identifying', 'impact', 'implement', 'implement and', 'implement security', 'implementing', 'improve', 'improve online', 'improving', 'in', 'in creating', 'in designing', 'in legal', 'in planning', 'in specific', 'in the', 'incidents', 'include', 'including', 'including servers', 'incorporate', 'increase', 'individuals', 'industrial', 'inform', 'information', 'informed', 'infrastructure', 'infrastructure ensuring', 'infrastructure including', 'initiatives', 'inquiries', 'insights', 'insights and', 'insights to', 'integration', 'integrity', 'integrity and', 'interaction', 'interaction designers', 'interactions', 'interactions within', 'interactive', 'interactive aspects', 'interface', 'interface designers', 'interfaces', 'interfaces for', 'interfaces they', 'interior', 'into', 'intuitive', 'intuitive and', 'inventory', 'inventory levels', 'investment', 'involves', 'is', 'is responsible', 'issues', 'issues and', 'it', 'it infrastructure', 'it systems', 'java', 'keyword', 'keyword research', 'knowledge', 'landscape', 'landscapes', 'language', 'law', 'laws', 'laws and', 'layouts', 'layouts buttons', 'lead', 'leads', 'learning', 'legal', 'legal counsel', 'legal matters', 'legal proceedings', 'level', 'levels', 'life', 'like', 'litigation', 'logic', 'logic and', 'logistics', 'logistics and', 'maintain', 'maintain data', 'maintain network', 'maintaining', 'maintenance', 'make', 'make informed', 'making', 'making within', 'manage', 'manage an', 'manage and', 'manage budgets', 'management', 'management and', 'manager', 'manager is', 'manager oversees', 'managers', 'managers oversee', 'manages', 'managing', 'managing schedules', 'manufacturing', 'market', 'market research', 'market trends', 'marketing', 'marketing campaigns', 'marketing efforts', 'marketing strategies', 'materials', 'matters', 'matters related', 'maximize', 'may', 'meaningful', 'meaningful and', 'measures', 'media', 'media managers', 'media metrics', 'media presence', 'media strategies', 'medical', 'medical care', 'medical conditions', 'meet', 'meet quality', 'meetings', 'meetings and', 'members', 'mergers', 'metrics', 'metrics to', 'minimize', 'mobile', 'models', 'monitor', 'monitor and', 'needs', 'needs and', 'needs of', 'negotiate', 'negotiate contracts', 'network', 'network administrators', 'network infrastructure', 'network performance', 'networks', 'networks and', 'networks they', 'new', 'nurse', 'nurse practitioners', 'nursing', 'objectives', 'of', 'of digital', 'of goods', 'of software', 'of web', 'of websites', 'offer', 'office', 'office operations', 'often', 'on', 'on the', 'on time', 'online', 'online visibility', 'operations', 'opportunities', 'opportunities and', 'optimal', 'optimal performance', 'optimize', 'optimize websites', 'optimizing', 'or', 'or products', 'or services', 'oral', 'organic', 'organic traffic', 'organization', 'organization they', 'organizational', 'organizations', 'organizations computer', 'organizations it', 'organizations social', 'organizing', 'other', 'other elements', 'overall', 'overall user', 'oversee', 'oversee an', 'oversees', 'pages', 'parties', 'patient', 'patients', 'pediatric', 'perform', 'performance', 'performance and', 'performance they', 'performing', 'personal', 'persuasive', 'plan', 'plan and', 'planners', 'planning', 'planning and', 'plans', 'plans and', 'platforms', 'pleasing', 'policies', 'portfolio', 'portfolios', 'positive', 'potential', 'power', 'practices', 'practitioners', 'preferences', 'preparation', 'presence', 'presence they', 'preventive', 'preventive care', 'problems', 'procedures', 'proceedings', 'process', 'processes', 'processes to', 'procurement', 'procurement processes', 'product', 'product availability', 'product quality', 'production', 'products', 'products or', 'products they', 'products to', 'professionals', 'programs', 'programs and', 'project', 'projects', 'promote', 'promoting', 'protect', 'prototypes', 'prototypes and', 'provide', 'provide insights', 'provide legal', 'provides', 'providing', 'public', 'purchasing', 'qa', 'quality', 'quality assurance', 'quality control', 'quality in', 'quality standards', 'rankings', 'reach', 'recommendations', 'recommendations for', 'recommendations to', 'records', 'recruitment', 'regulations', 'related', 'related to', 'relations', 'relationships', 'relationships with', 'reliability', 'reliable', 'report', 'reporting', 'reports', 'reports and', 'representatives', 'requirements', 'research', 'research and', 'research design', 'research optimize', 'resolve', 'resolve problems', 'resource', 'resources', 'respond', 'respond to', 'responses', 'responsible', 'responsible for', 'responsive', 'retirement', 'revenue', 'risks', 'role', 'safety', 'safety and', 'sales', 'sales and', 'sales representatives', 'sales targets', 'sales teams', 'satisfaction', 'saving', 'saving opportunities', 'schedule', 'schedule content', 'schedules', 'scheduling', 'scope', 'seamless', 'search', 'search engine', 'search engines', 'security', 'security and', 'security measures', 'seo', 'seo specialists', 'seo strategies', 'server', 'server side', 'servers', 'service', 'services', 'services they', 'services to', 'settings', 'side', 'skills', 'skills to', 'smooth', 'social', 'social media', 'software', 'software and', 'software applications', 'solutions', 'solutions and', 'solutions to', 'spaces', 'specialist', 'specialists', 'specialists focus', 'specialists optimize', 'specialize', 'specialize in', 'specializes', 'specializes in', 'specific', 'speech', 'staff', 'stakeholders', 'standards', 'standards they', 'stock', 'storage', 'strategic', 'strategies', 'strategies and', 'strategies they', 'strategies to', 'strategists', 'strategy', 'streamline', 'structures', 'students', 'success', 'such', 'such as', 'supply', 'supply chain', 'support', 'support data', 'support decision', 'support specialists', 'support the', 'support to', 'sustainability', 'sustainable', 'system', 'system responses', 'systems', 'systems and', 'target', 'target audiences', 'targets', 'tasks', 'tax', 'tax laws', 'tax planning', 'team', 'teams', 'teams to', 'techniques', 'technology', 'test', 'testing', 'tests', 'that', 'the', 'the company', 'the organization', 'the organizations', 'the overall', 'the server', 'the visual', 'their', 'their needs', 'them', 'therapist', 'therapy', 'they', 'they analyze', 'they assess', 'they build', 'they collaborate', 'they conduct', 'they configure', 'they create', 'they design', 'they develop', 'they diagnose', 'they ensure', 'they handle', 'they identify', 'they meet', 'they optimize', 'they plan', 'they provide', 'they track', 'they use', 'they work', 'threats', 'through', 'time', 'time and', 'to', 'to achieve', 'to create', 'to customers', 'to drive', 'to end', 'to enhance', 'to ensure', 'to improve', 'to increase', 'to individuals', 'to inform', 'to maintain', 'to maximize', 'to meet', 'to optimize', 'to promote', 'to provide', 'to streamline', 'to support', 'to the', 'tools', 'track', 'track performance', 'traffic', 'traffic and', 'training', 'training programs', 'transactions', 'transportation', 'treat', 'treatment', 'trends', 'trends and', 'troubleshoot', 'troubleshooting', 'troubleshooting issues', 'ui', 'ui ux', 'understand', 'understand their', 'use', 'use their', 'user', 'user behaviors', 'user experience', 'user experiences', 'user friendly', 'user interactions', 'user interface', 'user interfaces', 'user research', 'users', 'using', 'ux', 'valuable', 'valuable insights', 'various', 'vendor', 'vendor relationships', 'visibility', 'visual', 'visual and', 'visually', 'visually appealing', 'water', 'web', 'web applications', 'web pages', 'website', 'websites', 'websites and', 'websites for', 'websites they', 'wedding', 'weddings', 'well', 'well being', 'while', 'will', 'with', 'with clients', 'with cross', 'with development', 'with followers', 'with legal', 'with tax', 'with the', 'within', 'within an', 'within digital', 'work', 'work in', 'work on', 'work to', 'work with', 'working', 'works', 'write', 'you', 'you will', 'your', 'your role']
In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob

# Load the datasets
job_data = pd.read_csv('job_descriptions.csv')
candidate_data = pd.read_csv('candidate_profiles.csv')
company_data = pd.read_csv('company_profiles.csv')

# Analyze skill gaps and match candidate profiles to job requirements
vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=1000)
X_job = vectorizer.fit_transform(job_data['Job Description'].dropna())
job_skills = vectorizer.get_feature_names_out()

X_candidate = vectorizer.transform(candidate_data['Skills'].dropna())
candidate_skills = vectorizer.get_feature_names_out()

# Calculate cosine similarity between job requirements and candidate skills
similarity_matrix = cosine_similarity(X_job, X_candidate)

# Identify matching candidates for each job
for i, job_row in job_data.iterrows():
    job_title = job_row['Job Title']
    job_skills = vectorizer.transform([job_row['Job Description']])
    job_candidates = candidate_data.loc[similarity_matrix[i] > 0.7]  # Adjust similarity threshold as needed
    print(f"Matching candidates for {job_title}: {', '.join(job_candidates['Name'])}")

# Identify emerging skills and responsibilities
# Your code for analyzing job descriptions and identifying emerging skills and responsibilities goes here

# Analyze company profiles
# Your code for analyzing company profiles and understanding company reputation, values, and culture goes here

# Assessing company branding impact
# Your code for analyzing the impact of company branding on candidate attraction and retention goes here

# Identifying alignment between company values and candidate preferences
# Your code for comparing company values with candidate preferences and identifying alignment goes here
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[3], line 8
      6 # Load the datasets
      7 job_data = pd.read_csv('job_descriptions.csv')
----> 8 candidate_data = pd.read_csv('candidate_profiles.csv')
      9 company_data = pd.read_csv('company_profiles.csv')
     11 # Analyze skill gaps and match candidate profiles to job requirements

File ~\anaconda3\Lib\site-packages\pandas\io\parsers\readers.py:948, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
    935 kwds_defaults = _refine_defaults_read(
    936     dialect,
    937     delimiter,
   (...)
    944     dtype_backend=dtype_backend,
    945 )
    946 kwds.update(kwds_defaults)
--> 948 return _read(filepath_or_buffer, kwds)

File ~\anaconda3\Lib\site-packages\pandas\io\parsers\readers.py:611, in _read(filepath_or_buffer, kwds)
    608 _validate_names(kwds.get("names", None))
    610 # Create the parser.
--> 611 parser = TextFileReader(filepath_or_buffer, **kwds)
    613 if chunksize or iterator:
    614     return parser

File ~\anaconda3\Lib\site-packages\pandas\io\parsers\readers.py:1448, in TextFileReader.__init__(self, f, engine, **kwds)
   1445     self.options["has_index_names"] = kwds["has_index_names"]
   1447 self.handles: IOHandles | None = None
-> 1448 self._engine = self._make_engine(f, self.engine)

File ~\anaconda3\Lib\site-packages\pandas\io\parsers\readers.py:1705, in TextFileReader._make_engine(self, f, engine)
   1703     if "b" not in mode:
   1704         mode += "b"
-> 1705 self.handles = get_handle(
   1706     f,
   1707     mode,
   1708     encoding=self.options.get("encoding", None),
   1709     compression=self.options.get("compression", None),
   1710     memory_map=self.options.get("memory_map", False),
   1711     is_text=is_text,
   1712     errors=self.options.get("encoding_errors", "strict"),
   1713     storage_options=self.options.get("storage_options", None),
   1714 )
   1715 assert self.handles is not None
   1716 f = self.handles.handle

File ~\anaconda3\Lib\site-packages\pandas\io\common.py:863, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    858 elif isinstance(handle, str):
    859     # Check whether the filename is to be opened in binary mode.
    860     # Binary mode does not support 'encoding' and 'newline'.
    861     if ioargs.encoding and "b" not in ioargs.mode:
    862         # Encoding
--> 863         handle = open(
    864             handle,
    865             ioargs.mode,
    866             encoding=ioargs.encoding,
    867             errors=errors,
    868             newline="",
    869         )
    870     else:
    871         # Binary mode
    872         handle = open(handle, ioargs.mode)

FileNotFoundError: [Errno 2] No such file or directory: 'candidate_profiles.csv'
In [5]:
import pandas as pd

import matplotlib.pyplot as plt


# Load the dataset
data = pd.read_csv('job_descriptions.csv')

# Concatenate all job descriptions into a single string
all_descriptions = ' '.join(data['Job Description'].dropna())

# Generate a word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_descriptions)

# Plot the word cloud
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Word Cloud of Job Descriptions')
plt.axis('off')
plt.show()

# Analyze sentiment of job descriptions
sentiments = data['Job Description'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)

# Plot the distribution of sentiment scores
plt.figure(figsize=(8, 6))
plt.hist(sentiments, bins=30, color='skyblue', edgecolor='black')
plt.title('Distribution of Sentiment Scores in Job Descriptions')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.show()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[5], line 13
     10 all_descriptions = ' '.join(data['Job Description'].dropna())
     12 # Generate a word cloud
---> 13 wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_descriptions)
     15 # Plot the word cloud
     16 plt.figure(figsize=(10, 6))

NameError: name 'WordCloud' is not defined
In [6]:
import pandas as pd

import matplotlib.pyplot as plt


# Load the dataset
data = pd.read_csv('job_descriptions.csv')

# Concatenate all job descriptions into a single string
all_descriptions = ' '.join(data['Job Description'].dropna())



# Plot the word cloud
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Word Cloud of Job Descriptions')
plt.axis('off')
plt.show()

# Analyze sentiment of job descriptions
sentiments = data['Job Description'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)

# Plot the distribution of sentiment scores
plt.figure(figsize=(8, 6))
plt.hist(sentiments, bins=30, color='skyblue', edgecolor='black')
plt.title('Distribution of Sentiment Scores in Job Descriptions')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.show()
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[6], line 2
      1 import pandas as pd
----> 2 from wordcloud import WordCloud
      3 import matplotlib.pyplot as plt
      4 from textblob import TextBlob

ModuleNotFoundError: No module named 'wordcloud'
In [7]:
pip install wordcloud
Collecting wordcloud
  Downloading wordcloud-1.9.3-cp311-cp311-win_amd64.whl.metadata (3.5 kB)
Requirement already satisfied: numpy>=1.6.1 in c:\users\pc\anaconda3\lib\site-packages (from wordcloud) (1.26.4)
Requirement already satisfied: pillow in c:\users\pc\anaconda3\lib\site-packages (from wordcloud) (10.2.0)
Requirement already satisfied: matplotlib in c:\users\pc\anaconda3\lib\site-packages (from wordcloud) (3.8.0)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\pc\anaconda3\lib\site-packages (from matplotlib->wordcloud) (1.2.0)
Requirement already satisfied: cycler>=0.10 in c:\users\pc\anaconda3\lib\site-packages (from matplotlib->wordcloud) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\pc\anaconda3\lib\site-packages (from matplotlib->wordcloud) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\pc\anaconda3\lib\site-packages (from matplotlib->wordcloud) (1.4.4)
Requirement already satisfied: packaging>=20.0 in c:\users\pc\anaconda3\lib\site-packages (from matplotlib->wordcloud) (23.1)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\pc\anaconda3\lib\site-packages (from matplotlib->wordcloud) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\pc\anaconda3\lib\site-packages (from matplotlib->wordcloud) (2.8.2)
Requirement already satisfied: six>=1.5 in c:\users\pc\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib->wordcloud) (1.16.0)
Downloading wordcloud-1.9.3-cp311-cp311-win_amd64.whl (300 kB)
   ---------------------------------------- 0.0/300.2 kB ? eta -:--:--
   -- ------------------------------------ 20.5/300.2 kB 330.3 kB/s eta 0:00:01
   ----- --------------------------------- 41.0/300.2 kB 495.5 kB/s eta 0:00:01
   -------------- ----------------------- 112.6/300.2 kB 819.2 kB/s eta 0:00:01
   -------------- ----------------------- 112.6/300.2 kB 819.2 kB/s eta 0:00:01
   ---------------------------- --------- 225.3/300.2 kB 981.9 kB/s eta 0:00:01
   ---------------------------------------  297.0/300.2 kB 1.1 MB/s eta 0:00:01
   -------------------------------------- 300.2/300.2 kB 976.2 kB/s eta 0:00:00
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.3
Note: you may need to restart the kernel to use updated packages.
In [10]:
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from textblob import TextBlob

# Load the dataset
data = pd.read_csv('job_descriptions.csv')

# Concatenate all job descriptions into a single string
all_descriptions = ' '.join(data['Job Description'].dropna())

# Generate a word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_descriptions)

# Plot the word cloud
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Word Cloud of Job Descriptions')
plt.axis('off')
plt.show()

# Analyze sentiment of job descriptions
sentiments = data['Job Description'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)

# Plot the distribution of sentiment scores
plt.figure(figsize=(8, 6))
plt.hist(sentiments, bins=30, color='skyblue', edgecolor='black')
plt.title('Distribution of Sentiment Scores in Job Descriptions')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.show()
No description has been provided for this image
No description has been provided for this image
In [13]:
data.describe()
Out[13]:
Job Id latitude longitude Company Size
count 1.615940e+06 1.615940e+06 1.615940e+06 1.615940e+06
mean 1.548935e+15 1.937743e+01 1.639926e+01 7.370467e+04
std 8.946722e+14 2.355690e+01 7.066762e+01 3.529886e+04
min 1.817948e+11 -4.090060e+01 -1.751982e+02 1.264600e+04
25% 7.740508e+14 5.152100e+00 -1.531010e+01 4.311400e+04
50% 1.547858e+15 1.807080e+01 1.914510e+01 7.363300e+04
75% 2.323729e+15 3.907420e+01 4.757690e+01 1.043000e+05
max 3.099618e+15 7.170690e+01 1.780650e+02 1.348340e+05
In [ ]:
data.columns;
In [23]:
from collections import Counter
import re
from nltk.corpus import stopwords

# Download stopwords 
import nltk
nltk.download('stopwords')

# Tokenize and remove stopwords
stop_words = set(stopwords.words('english'))
words = re.findall(r'\w+', all_descriptions.lower())
filtered_words = [word for word in words if word not in stop_words]

# Get the most common words
word_counts = Counter(filtered_words)
common_words = word_counts.most_common(20)

# Plot the most common words
words, counts = zip(*common_words)
plt.figure(figsize=(10, 6))
plt.bar(words, counts, color='skyblue')
plt.title('Most Common Words in Job Descriptions')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
No description has been provided for this image
In [24]:
skills = ['Python', 'Excel', 'SQL', 'Java', 'C++', 'JavaScript', 'R', 'Machine Learning', 'Deep Learning']
skill_counts = {skill: all_descriptions.lower().count(skill.lower()) for skill in skills}

# Plot skill frequencies
plt.figure(figsize=(10, 6))
plt.bar(skill_counts.keys(), skill_counts.values(), color='skyblue')
plt.title('Skill Frequency in Job Descriptions')
plt.xlabel('Skills')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [26]:
# Plot the number of job postings per country
country_counts = data['Country'].value_counts()

plt.figure(figsize=(10, 6))
country_counts.plot(kind='bar', color='skyblue')
plt.title('Job Postings by Country')
plt.xlabel('Country')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [27]:
# Plot the distribution of job types
job_type_counts = data['Work Type'].value_counts()

plt.figure(figsize=(10, 6))
job_type_counts.plot(kind='bar', color='skyblue')
plt.title('Distribution of Job Types')
plt.xlabel('Job Type')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [29]:
import seaborn as sns

# Remove NaN values for plotting
experience_salary_data = data[['Experience', 'Salary Range']].dropna()

plt.figure(figsize=(10, 6))
sns.regplot(x='Experience', y='Salary Range', data=experience_salary_data, scatter_kws={'color': 'skyblue'}, line_kws={'color': 'red'})
plt.title('Experience vs Salary Range')
plt.xlabel('Years of Experience')
plt.ylabel('Salary Range')
plt.show()
No description has been provided for this image
In [30]:
# For the heatmap, we'll need a pivot table
country_job_counts = data.pivot_table(index='Country', columns='Job Title', aggfunc='size', fill_value=0)

plt.figure(figsize=(14, 10))
sns.heatmap(country_job_counts, cmap='Blues', annot=True, fmt='d')
plt.title('Heatmap of Job Postings by Country and Job Title')
plt.xlabel('Job Title')
plt.ylabel('Country')
plt.show()
No description has been provided for this image
In [31]:
plt.figure(figsize=(10, 6))
sns.violinplot(x='Work Type', y=sentiments, data=data, inner='quartile', palette='pastel')
plt.title('Sentiment Distribution by Job Type')
plt.xlabel('Job Type')
plt.ylabel('Sentiment Score')
plt.show()
No description has been provided for this image
In [32]:
data['Job Posting Date'] = pd.to_datetime(data['Job Posting Date'])
data.set_index('Job Posting Date', inplace=True)

# Resample to get job postings per month
monthly_postings = data.resample('M').size()

plt.figure(figsize=(12, 6))
plt.plot(monthly_postings.index, monthly_postings, marker='o', color='skyblue')
plt.title('Job Postings Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Postings')
plt.grid(True)
plt.show()
No description has been provided for this image
In [ ]: